import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# pd.set_option('display.max_colwidth', -1)
import matplotlib.pyplot as plt
from matplotlib import rcParams
import re
from wordcloud import WordCloud
from collections import Counter
import csv
from matplotlib import rcParams
from nltk.corpus import stopwords
import nltk
from nltk.util import ngrams
stop = stopwords.words('english')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report,plot_confusion_matrix
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# List every file below the data directory.  The CSVs in this script are
# loaded from the relative "input/" folder, so walk that same folder rather
# than the absolute '/input' path, which refers to a different location.
for dirname, _, filenames in os.walk('input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Load the two source datasets: `true` holds the articles later labelled
# 'true', `false` the articles later labelled 'fake'.
true = pd.read_csv("input/True.csv")
false = pd.read_csv("input/Fake.csv")
# Preview the first rows of the real-news frame.
true.head()
| title | text | subject | date | |
|---|---|---|---|---|
| 0 | As U.S. budget fight looms, Republicans flip t... | WASHINGTON (Reuters) - The head of a conservat... | politicsNews | December 31, 2017 |
| 1 | U.S. military to accept transgender recruits o... | WASHINGTON (Reuters) - Transgender people will... | politicsNews | December 29, 2017 |
| 2 | Senior U.S. Republican senator: 'Let Mr. Muell... | WASHINGTON (Reuters) - The special counsel inv... | politicsNews | December 31, 2017 |
| 3 | FBI Russia probe helped by Australian diplomat... | WASHINGTON (Reuters) - Trump campaign adviser ... | politicsNews | December 30, 2017 |
| 4 | Trump wants Postal Service to charge 'much mor... | SEATTLE/WASHINGTON (Reuters) - President Donal... | politicsNews | December 29, 2017 |
# Preview the first rows of the fake-news frame.
false.head()
| title | text | subject | date | |
|---|---|---|---|---|
| 0 | Donald Trump Sends Out Embarrassing New Year’... | Donald Trump just couldn t wish all Americans ... | News | December 31, 2017 |
| 1 | Drunk Bragging Trump Staffer Started Russian ... | House Intelligence Committee Chairman Devin Nu... | News | December 31, 2017 |
| 2 | Sheriff David Clarke Becomes An Internet Joke... | On Friday, it was revealed that former Milwauk... | News | December 30, 2017 |
| 3 | Trump Is So Obsessed He Even Has Obama’s Name... | On Christmas day, Donald Trump announced that ... | News | December 29, 2017 |
| 4 | Pope Francis Just Called Out Donald Trump Dur... | Pope Francis used his annual Christmas Day mes... | News | December 25, 2017 |
# Count how many real articles fall under each subject.
true.subject.value_counts()
politicsNews 11272 worldnews 10145 Name: subject, dtype: int64
# Default figure size (width, height) for the plots drawn from here on.
rcParams['figure.figsize'] = (15, 10)
# Bar chart of the number of real articles per subject.
true['subject'].value_counts().plot.bar()
<AxesSubplot:>
# Split the real articles by subject.  Boolean-mask indexing returns a
# selection of `true`; taking an explicit .copy() makes each subset an
# independent frame, so assigning to its columns later does not trigger
# pandas' SettingWithCopyWarning.
politics = true[true['subject'] == "politicsNews"].copy()
worldnews = true[true['subject'] == "worldnews"].copy()
# Report (rows, columns) of each subset.
print(politics.shape)
print(worldnews.shape)
(11272, 4) (10145, 4)
# Length of every article body.  Series.str.len() counts characters, not
# words, so the messages below report characters.
politics_text_len = politics['text'].str.len()
worldnews_text_len = worldnews['text'].str.len()
# Series.max() skips missing values, unlike the builtin max().
print("The maximum length of string in Politics news is {} characters".format(politics_text_len.max()))
print("The maximum length of string in World news is {} characters".format(worldnews_text_len.max()))
The maximum lenght of string in Politcs news is 29781 words The maximum lenght of string in World news is 17999 words
def tokenizeandstopwords(text):
    """Tokenize *text* and drop punctuation and English stop words.

    Args:
        text: Raw article text as a single string.

    Returns:
        The remaining alphabetic tokens joined by single spaces, in their
        original order and casing.
    """
    # Build the lookup once per call: set membership is O(1), whereas the
    # module-level `stop` list would be scanned for every token.
    stop_words = set(stop)
    tokens = nltk.word_tokenize(text)
    # Keep purely alphabetic tokens (drops punctuation and numbers) and
    # compare in lower case: the NLTK stop-word list is lower-case, so a
    # case-sensitive test lets capitalised words such as "The" through.
    meaningful_words = [
        w for w in tokens
        if w.isalpha() and w.lower() not in stop_words
    ]
    return " ".join(meaningful_words)
# Replace each article body with its tokenized, stop-word-free version.
for subset in (politics, worldnews):
    subset['text'] = subset['text'].apply(tokenizeandstopwords)
<ipython-input-31-054355159e10>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy politics['text'] = politics['text'].apply(tokenizeandstopwords) <ipython-input-31-054355159e10>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy worldnews['text'] = worldnews['text'].apply(tokenizeandstopwords)
def generate_word_cloud(text):
    """Render a word cloud for *text* and display it with matplotlib.

    Args:
        text: Either a single string or an iterable of documents (for
            example a NumPy array or pandas Series of strings).

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    # str() of a large array is its abbreviated repr (elements elided with
    # "..."), so most documents would never reach the cloud.  Join the
    # documents explicitly so that every one of them contributes.
    if isinstance(text, str):
        corpus = text
    else:
        try:
            corpus = " ".join(str(doc) for doc in text)
        except TypeError:
            # Not iterable: fall back to the plain string form.
            corpus = str(text)

    wordcloud = WordCloud(
        width=3000,
        height=2000,
        background_color='black',
    ).generate(corpus)

    plt.figure(figsize=(40, 30), facecolor='k', edgecolor='k')
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()
# Word cloud over the cleaned bodies held in the `politics` frame.
politics_text = politics['text'].to_numpy()
generate_word_cloud(politics_text)
# Show the first rows of the fake-news frame again before splitting it by subject.
false.head()
| title | text | subject | date | |
|---|---|---|---|---|
| 0 | Donald Trump Sends Out Embarrassing New Year’... | Donald Trump just couldn t wish all Americans ... | News | December 31, 2017 |
| 1 | Drunk Bragging Trump Staffer Started Russian ... | House Intelligence Committee Chairman Devin Nu... | News | December 31, 2017 |
| 2 | Sheriff David Clarke Becomes An Internet Joke... | On Friday, it was revealed that former Milwauk... | News | December 30, 2017 |
| 3 | Trump Is So Obsessed He Even Has Obama’s Name... | On Christmas day, Donald Trump announced that ... | News | December 29, 2017 |
| 4 | Pope Francis Just Called Out Donald Trump Dur... | Pope Francis used his annual Christmas Day mes... | News | December 25, 2017 |
# Distinct subject labels present in the fake-news frame.
set(false.subject)
{'Government News', 'Middle-east', 'News', 'US_News', 'left-news', 'politics'}
# Per-subject article counts for the real news, for comparison with the fake news.
true.subject.value_counts()
politicsNews 11272 worldnews 10145 Name: subject, dtype: int64
# Per-subject article counts for the fake news.
false.subject.value_counts()
News 9050 politics 6841 left-news 4459 Government News 1570 US_News 783 Middle-east 778 Name: subject, dtype: int64
# Split the fake articles by subject.  Each subset is an explicit .copy(),
# so overwriting its 'text' column later operates on an independent frame
# instead of a selection of `false` (avoids SettingWithCopyWarning).
# NOTE(review): the 'left-news' subject gets no subset here -- confirm that
# this is intentional.
Government_News = false[false['subject'] == "Government News"].copy()
Middle_east = false[false['subject'] == "Middle-east"].copy()
News = false[false['subject'] == "News"].copy()
US_News = false[false['subject'] == "US_News"].copy()
# NOTE: this rebinds `politics`, which earlier held the real "politicsNews"
# articles; from here on it refers to the fake "politics" articles.
politics = false[false['subject'] == "politics"].copy()
# Replace each article body with its tokenized, stop-word-free version,
# one fake-news subset at a time.
for subset in (Government_News, Middle_east, News, US_News, politics):
    subset['text'] = subset['text'].apply(tokenizeandstopwords)
<ipython-input-39-e07793f1b127>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy Government_News['text'] = Government_News['text'].apply(tokenizeandstopwords) <ipython-input-39-e07793f1b127>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy Middle_east['text'] = Middle_east['text'].apply(tokenizeandstopwords) <ipython-input-39-e07793f1b127>:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy News['text'] = News['text'].apply(tokenizeandstopwords) <ipython-input-39-e07793f1b127>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy US_News['text'] = US_News['text'].apply(tokenizeandstopwords) <ipython-input-39-e07793f1b127>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy politics['text'] = politics['text'].apply(tokenizeandstopwords)
# Word cloud over the cleaned bodies of the "Government News" fake articles.
govertment_news_text = Government_News['text'].to_numpy()
generate_word_cloud(govertment_news_text)
# Tag every row with its class label before merging the two sources.
false['target'] = 'fake'
true['target'] = 'true'
# Stack the fake articles first, then the real ones, and renumber the rows.
news = pd.concat([false, true], ignore_index=True)
news.head()
| title | text | subject | date | target | |
|---|---|---|---|---|---|
| 0 | Donald Trump Sends Out Embarrassing New Year’... | Donald Trump just couldn t wish all Americans ... | News | December 31, 2017 | fake |
| 1 | Drunk Bragging Trump Staffer Started Russian ... | House Intelligence Committee Chairman Devin Nu... | News | December 31, 2017 | fake |
| 2 | Sheriff David Clarke Becomes An Internet Joke... | On Friday, it was revealed that former Milwauk... | News | December 30, 2017 | fake |
| 3 | Trump Is So Obsessed He Even Has Obama’s Name... | On Christmas day, Donald Trump announced that ... | News | December 29, 2017 | fake |
| 4 | Pope Francis Just Called Out Donald Trump Dur... | Pope Francis used his annual Christmas Day mes... | News | December 25, 2017 | fake |
# Strip URLs.  The pattern is a raw string (so `\S` is not parsed as a
# string escape) and matches both http:// and https:// links.
news['text'] = news['text'].str.replace(r"https?://\S+", " ", regex=True)
# Lower-case the text and blank out '@' characters (plain, non-regex match).
news['text'] = news['text'].str.lower().str.replace("@", " ", regex=False)
news.head()
| title | text | subject | date | target | |
|---|---|---|---|---|---|
| 0 | Donald Trump Sends Out Embarrassing New Year’... | donald trump just couldn t wish all americans ... | News | December 31, 2017 | fake |
| 1 | Drunk Bragging Trump Staffer Started Russian ... | house intelligence committee chairman devin nu... | News | December 31, 2017 | fake |
| 2 | Sheriff David Clarke Becomes An Internet Joke... | on friday, it was revealed that former milwauk... | News | December 30, 2017 | fake |
| 3 | Trump Is So Obsessed He Even Has Obama’s Name... | on christmas day, donald trump announced that ... | News | December 29, 2017 | fake |
| 4 | Pope Francis Just Called Out Donald Trump Dur... | pope francis used his annual christmas day mes... | News | December 25, 2017 | fake |
def basic_clean(text):
    """Strip punctuation, drop English stop words and lemmatize the rest.

    Args:
        text: The text to clean, as a single string.

    Returns:
        A list of lemmatized words, in their original order.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    # A set gives O(1) lookups per word; the distinct local name also avoids
    # shadowing the `stopwords` module imported at the top of the file.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # Remove everything that is neither a word character nor whitespace,
    # then split on whitespace.
    words = re.sub(r'[^\w\s]', '', text).split()
    # Compare in lower case: the NLTK stop-word list is lower-case, so a
    # case-sensitive test lets capitalised words such as "The" through.
    return [wnl.lemmatize(word) for word in words if word.lower() not in stop_words]
# Fetch the WordNet corpus, which the WordNetLemmatizer in basic_clean relies on.
nltk.download('wordnet')
[nltk_data] Downloading package wordnet to [nltk_data] C:\Users\Bima\AppData\Roaming\nltk_data... [nltk_data] Unzipping corpora\wordnet.zip.
True
# Feed the real articles to basic_clean as one space-separated string.
# Passing str() of the list would hand over its repr instead: brackets,
# quotes and escaped sequences such as "\n", whose letters then leak into
# the resulting words once the punctuation is stripped.
true_word = basic_clean(' '.join(true['text'].astype(str)))
# Hold out 20% of the articles for evaluation; the seed is fixed so the
# split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    news['text'],
    news['target'],
    test_size=0.2,
    random_state=2020,
)
# Token counts -> TF-IDF weighting -> logistic regression.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', LogisticRegression()),
]
pipe = Pipeline(pipeline_steps)
model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
# Accuracy on the held-out split, as a percentage rounded to two decimals.
accuracy_pct = round(accuracy_score(y_test, prediction) * 100, 2)
print("accuracy: {}%".format(accuracy_pct))
accuracy: 98.76%
# Confusion matrix of held-out labels versus predictions.
conf_mat = confusion_matrix(y_test, prediction)
print(conf_mat)
# Per-class precision, recall and F1 on the same split.
report = classification_report(y_test, prediction)
print(report)
[[4674 66]
[ 45 4195]]
precision recall f1-score support
fake 0.99 0.99 0.99 4740
true 0.98 0.99 0.99 4240
accuracy 0.99 8980
macro avg 0.99 0.99 0.99 8980
weighted avg 0.99 0.99 0.99 8980
# Draw the confusion matrix of the fitted pipeline on the held-out split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
# Confirm the installed version before upgrading.
plot_confusion_matrix(model,x_test,y_test)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1945d4b4ca0>